This workflow should help us decide if it’s meaningful to run ZINB-WAVE with different values of k to get a consensus clustering or if it’s better to pick “the right k.”
For now I’m using the data that I have stored in my computer, but eventually we should start with the data from GEO. Question: which one? RSEM? Cufflinks? Kallisto?
load("~/git/scone_analysis/data/olfactory_for_scone.rda")
se <- as(scone_obj, "SummarizedExperiment")
colData(se)
## DataFrame with 808 rows and 16 columns
## NREADS NALIGNED RALIGN TOTAL_DUP PRIMER
## <numeric> <numeric> <numeric> <numeric> <numeric>
## OEP01_N706_S501 3313260 3167600 95.6035 47.9943 0.0154566
## OEP01_N701_S501 2902430 2757790 95.0167 45.0150 0.0182066
## OEP01_N707_S507 2307940 2178350 94.3852 43.7832 0.0219196
## OEP01_N705_S501 3337400 3183720 95.3952 43.2688 0.0183041
## OEP01_N704_S507 117892 98628 83.6596 18.0576 0.0623744
## ... ... ... ... ... ...
## OEL23_N704_S510 2407440 2305060 95.7472 47.1489 0.0159111
## OEL23_N705_S502 2308940 2203300 95.4244 62.5638 0.0195812
## OEL23_N706_S502 2215640 2108490 95.1637 50.6643 0.0182207
## OEL23_N704_S503 2673790 2568300 96.0546 60.5481 0.0155611
## OEL23_N703_S502 2450320 2363500 96.4567 48.4164 0.0122704
## PCT_RIBOSOMAL_BASES PCT_CODING_BASES PCT_UTR_BASES
## <numeric> <numeric> <numeric>
## OEP01_N706_S501 2.0e-06 0.200130 0.230654
## OEP01_N701_S501 0.0e+00 0.182461 0.201810
## OEP01_N707_S507 0.0e+00 0.152627 0.207897
## OEP01_N705_S501 2.0e-06 0.169514 0.207342
## OEP01_N704_S507 1.4e-05 0.110724 0.199174
## ... ... ... ...
## OEL23_N704_S510 0e+00 0.287346 0.314104
## OEL23_N705_S502 0e+00 0.337264 0.297077
## OEL23_N706_S502 7e-06 0.244333 0.262663
## OEL23_N704_S503 0e+00 0.343203 0.338217
## OEL23_N703_S502 8e-06 0.259367 0.238239
## PCT_INTRONIC_BASES PCT_INTERGENIC_BASES PCT_MRNA_BASES
## <numeric> <numeric> <numeric>
## OEP01_N706_S501 0.404205 0.165009 0.430784
## OEP01_N701_S501 0.465702 0.150027 0.384271
## OEP01_N707_S507 0.511416 0.128060 0.360524
## OEP01_N705_S501 0.457556 0.165586 0.376856
## OEP01_N704_S507 0.489514 0.200573 0.309898
## ... ... ... ...
## OEL23_N704_S510 0.250658 0.147892 0.601450
## OEL23_N705_S502 0.230214 0.135445 0.634341
## OEL23_N706_S502 0.355899 0.137097 0.506997
## OEL23_N704_S503 0.174696 0.143885 0.681420
## OEL23_N703_S502 0.376091 0.126294 0.497606
## MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS
## <numeric> <numeric> <numeric>
## OEP01_N706_S501 0.843857 0.061028 0.521079
## OEP01_N701_S501 0.914370 0.033350 0.373993
## OEP01_N707_S507 0.955405 0.014606 0.491230
## OEP01_N705_S501 0.816630 0.101798 0.525238
## OEP01_N704_S507 1.199780 0.000000 0.706512
## ... ... ... ...
## OEL23_N704_S510 0.698455 0.198224 0.419745
## OEL23_N705_S502 0.830816 0.105091 0.398755
## OEL23_N706_S502 0.805627 0.103363 0.431862
## OEL23_N704_S503 0.745201 0.118615 0.384220
## OEL23_N703_S502 0.711685 0.196725 0.377926
## batch bio
## <factor> <factor>
## OEP01_N706_S501 Y01 K5ERRY_UI_96HPT
## OEP01_N701_S501 Y01 K5ERRY_UI_96HPT
## OEP01_N707_S507 Y01 K5ERRY_UI_96HPT
## OEP01_N705_S501 Y01 K5ERRY_UI_96HPT
## OEP01_N704_S507 Y01 K5ERRY_UI_96HPT
## ... ... ...
## OEL23_N704_S510 P14 K5ERP63CKO_UI_14DPT
## OEL23_N705_S502 P14 K5ERP63CKO_UI_14DPT
## OEL23_N706_S502 P14 K5ERP63CKO_UI_14DPT
## OEL23_N704_S503 P14 K5ERP63CKO_UI_14DPT
## OEL23_N703_S502 P14 K5ERP63CKO_UI_14DPT
labels <- read.table("https://raw.githubusercontent.com/rufletch/p63-HBC-diff/master/ref/oeHBCdiff_clusterLabels.txt", row.names = 1)
cl <- labels[,1]
names(cl) <- rownames(labels)
table(cl)
## cl
## 1 2 3 4 5 7 8 9 10 11 12 14 15
## 91 25 56 40 96 60 28 79 26 22 35 26 32
original <- rep(-1, NCOL(se))
names(original) <- colnames(se)
original[intersect(names(cl), colnames(se))] <- cl[intersect(names(cl), colnames(se))]
palette <- read.table("https://raw.githubusercontent.com/rufletch/p63-HBC-diff/master/ref/oeHBCdiff_colorPalette.txt", comment.char = "%", stringsAsFactors = FALSE)
pal <- c("transparent", palette[,2])
names(pal) <- c("-1", palette[,1])
colData(se)$original_cluster <- as.factor(original)
First of all, we check that with K=2 we get something reasonable.
For now, let’s focus on the 1000 most variable genes, but eventually one question is which genes to use and how to decide.
vars <- rowVars(log1p(assay(se)))
names(vars) <- rownames(se)
vars <- sort(vars, decreasing = TRUE)
core <- se[names(vars)[1:1000],]
core
## class: SummarizedExperiment
## dim: 1000 808
## metadata(1): qc_idx
## assays(1): expression
## rownames(1000): ERCC-00002 ERCC-00096 ... Hnrnph2 Cnot6
## rowData names(5): negcon_ruv negcon_eval poscon_eval
## negcon_validation poscon_validation
## colnames(808): OEP01_N706_S501 OEP01_N701_S501 ... OEL23_N704_S503
## OEL23_N703_S502
## colData names(17): NREADS NALIGNED ... bio original_cluster
zinb2 <- zinbFit(core, X = "~batch", K = 2, ncores = 7)
plot(getW(zinb2), pch=19, col=pal[colData(core)$original_cluster])
This seems fine.
Since we are running clusterExperiment on the first 50 principal components, what happens if we run zinb with k=50?
zinb50 <- zinbFit(core, X = "~batch", K = 50, ncores = 7)
plot(getW(zinb50), pch=19, col=pal[colData(core)$original_cluster])
ks <- 3:10
if(run_zinb) {
zinb_res <- lapply(ks, function(k) zinbFit(core, X = "~batch", K = k, ncores=7))
save(zinb_res, file="../data/zinb_k3_10.rda")
} else {
load("../data/zinb_k3_10.rda")
}
for(i in seq_along(ks)) {
pairs(getW(zinb_res[[i]]), pch=19, col=pal[colData(core)$original_cluster])
}
ws <- lapply(zinb_res, function(x) t(getW(x)))
if(run_cluster) {
cl_res <- clusterMany(ws, ks = 5:15, alphas = c(0.1, 0.3), betas = 0.9,
clusterFunction = "hierarchical01", minSizes=5,
sequential = TRUE, subsample = TRUE, ncores = 7)
save(cl_res, file="../data/cl_k3_10.rda")
} else {
load("../data/cl_k3_10.rda")
}
combined <- combineMany(cl_res$clMat, proportion = 0.5)
dendro <- makeDendrogram(ws[[8]], combined$clustering)
merged <- mergeClusters(assay(core), combined$clustering, dendro$clusters, mergeMethod = "adjP")
clmat <- cbind(merged$clustering, combined$clustering, cl_res$clMat)
plotClusters(clmat)
combined <- combineMany(cl_res$clMat[,grep("dataset8", colnames(cl_res$clMat))],
proportion = 0.5)
dendro <- makeDendrogram(ws[[8]], combined$clustering)
merged <- mergeClusters(assay(core), combined$clustering, dendro$clusters, mergeMethod = "adjP")
clmat <- cbind(merged$clustering, combined$clustering, cl_res$clMat)
plotClusters(clmat)
## matching the parameters used by Russell
W <- t(getW(zinb50))
if(run_cluster) {
cl_res <- clusterMany(W, ks = 4:15, alphas = c(0.1), betas = 0.8,
clusterFunction = "hierarchical01", minSizes=1,
sequential = TRUE, subsample = TRUE, ncores = 7,
subsampleArgs = list(resamp.num=100,
clusterFunction="kmeans",
clusterArgs=list(nstart=10)),
seqArgs = list(k.min=3, top.can=5), verbose=TRUE)
save(cl_res, file="../data/cl_k50.rda")
} else {
load("../data/cl_k50.rda")
}
combined <- combineMany(cl_res, proportion = 0.7)
## Note: no clusters specified to combine, using results from clusterMany
plotClusters(combined, colPalette = c(bigPalette, rainbow(100)))
table(primaryClusterNamed(combined))
##
## -1 c1 c10 c2 c3 c4 c5 c6 c7 c8 c9
## 172 163 5 16 103 123 10 8 124 47 37
sum(primaryCluster(combined) == -1)
## [1] 172
sum(colData(core)$original_cluster == -1)
## [1] 193
table(primaryClusterNamed(combined), colData(core)$original_cluster)
##
## -1 1 2 3 4 5 7 8 9 10 11 12 14 15
## -1 52 39 1 3 5 26 8 27 0 1 6 0 3 1
## c1 50 51 0 2 2 58 0 0 0 0 0 0 0 0
## c10 1 0 0 0 0 0 0 0 0 0 0 0 4 0
## c2 3 1 0 0 0 11 0 1 0 0 0 0 0 0
## c3 11 0 24 50 0 0 0 0 1 0 15 0 2 0
## c4 35 0 0 1 33 1 52 0 1 0 0 0 0 0
## c5 10 0 0 0 0 0 0 0 0 0 0 0 0 0
## c6 8 0 0 0 0 0 0 0 0 0 0 0 0 0
## c7 14 0 0 0 0 0 0 0 75 0 0 35 0 0
## c8 3 0 0 0 0 0 0 0 2 25 0 0 17 0
## c9 6 0 0 0 0 0 0 0 0 0 0 0 0 31
plot(getW(zinb50), pch=19, col=pal[colData(core)$original_cluster])
plot(getW(zinb50), pch=19, col=pal[as.factor(primaryCluster(combined))])
combined <- makeDendrogram(combined)
plotDendrogram(combined)
merged <- mergeClusters(combined, mergeMethod = "locfdr", cutoff = 0.01)
## Note: Merging will be done on ' combineMany ', with clustering index 1
#
# plot(getW(zinb50), pch=19, col=pal[as.factor(primaryCluster(merged))])
Let’s try to run slingshot on W, with different values of k to see how stable the lineages are. We run slingshot in two modes, one in which we specify only the starting cluster (unsupervised) and one in which we specify the end clusters too (supervised).
A note on the interpretation of the clusters.
| Cluster name | Description | Correspondence |
|---|---|---|
| c1 | resting HBC | original 1, 5 |
| c2 | activated HBC | original 5 |
| c3 | GBC / immature neurons / MV 1 | original 2, 3, 11 |
| c4 | Sus | original 4, 7 |
| c5 | new | new |
| c6 | new | new |
| c7 | Neuron | original 9, 12 |
| c8 | Immature Neuron | original 10, 14 |
| c9 | Microvillus | original 15 |
| c10 | Immature Neuron | original 14 |
| Cluster name | Correspondence | Description |
|---|---|---|
| m1 | c1 + c2 | HBC |
| m2 | c3 + c10 | GBC + immature nerons |
| m3 | c4 | Sus |
| m4 | c5 + c6 | new |
| m5 | c7 | Neuron |
| m6 | c8 | Immature Neuron |
| m7 | c9 | Microvillus |
We try with k=3, k=4, k=5.
library(slingshot)
pal2 <- pal[-1][c(1, 5, 3, 4, 2, 10, 11, 9, 13)]
pal3 <- pal[-1][c(1, 3, 4, 2, 11, 9, 13)]
cl_labs <- as.factor(primaryClusterNamed(merged))
cl <- droplevels(cl_labs[cl_labs != "-1"])
for(k in c(3, 4, 5)) {
zinb_k <- zinbFit(core[,cl_labs != "-1"], X = "~batch", K = k, ncores = 7)
X <- getW(zinb_k)
lineages <- get_lineages(X, clus.labels = cl, start.clus = "m1")
curves <- get_curves(X, clus.labels = cl, lineages = lineages)
plot_curves(X, cl, curves, col.clus = pal3)
plot_tree(X, cl, lineages, col.clus = pal3)
print(paste0("K=", k))
print(lineages$lineage1)
print(lineages$lineage2)
print(lineages$lineage3)
print(lineages$lineage4)
print(lineages$lineage5)
}
## [1] "K=3"
## [1] "m1" "m4" "m2" "m6" "m5"
## [1] "m1" "m4" "m2" "m7"
## [1] "m1" "m3"
## NULL
## NULL
## [1] "K=4"
## [1] "m1" "m4" "m2" "m6" "m5"
## [1] "m1" "m4" "m2" "m7"
## [1] "m1" "m3"
## NULL
## NULL
## [1] "K=5"
## [1] "m1" "m3" "m2" "m6" "m5"
## [1] "m1" "m3" "m2" "m7"
## [1] "m1" "m4"
## NULL
## NULL
We try with k=3, k=4, k=5.
for(k in c(3, 4, 5)) {
zinb_k <- zinbFit(core[,cl_labs != "-1"], X = "~batch", K = k, ncores = 7)
X <- getW(zinb_k)
lineages <- get_lineages(X, clus.labels = cl, start.clus = "m1", end.clus = c("m3", "m5", "m7"))
curves <- get_curves(X, clus.labels = cl, lineages = lineages)
plot_curves(X, cl, curves, col.clus = pal3)
plot_tree(X, cl, lineages, col.clus = pal3)
print(paste0("K=", k))
print(lineages$lineage1)
print(lineages$lineage2)
print(lineages$lineage3)
print(lineages$lineage4)
print(lineages$lineage5)
}
## [1] "K=3"
## [1] "m1" "m4" "m2" "m6" "m5"
## [1] "m1" "m4" "m2" "m7"
## [1] "m1" "m3"
## NULL
## NULL
## [1] "K=4"
## [1] "m1" "m4" "m2" "m6" "m5"
## [1] "m1" "m4" "m2" "m7"
## [1] "m1" "m3"
## NULL
## NULL
## [1] "K=5"
## [1] "m1" "m4" "m2" "m6" "m5"
## [1] "m1" "m4" "m2" "m7"
## [1] "m1" "m3"
## NULL
## NULL
sessionInfo()
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] slingshot_0.0.3-5 princurve_1.1-12
## [3] scone_1.1.0 clusterExperiment_1.3.0
## [5] bigmemory_4.5.19 bigmemory.sri_0.1.3
## [7] zinbwave_0.99.1 SummarizedExperiment_1.7.2
## [9] DelayedArray_0.3.5 matrixStats_0.52.2
## [11] Biobase_2.37.2 GenomicRanges_1.29.3
## [13] GenomeInfoDb_1.13.1 IRanges_2.11.2
## [15] S4Vectors_0.15.1 BiocGenerics_0.23.0
##
## loaded via a namespace (and not attached):
## [1] shinydashboard_0.5.3 R.utils_2.5.0
## [3] RSQLite_1.1-2 AnnotationDbi_1.39.0
## [5] htmlwidgets_0.8 grid_3.4.0
## [7] trimcluster_0.1-2 BiocParallel_1.11.1
## [9] RNeXML_2.0.7 DESeq_1.29.0
## [11] munsell_0.4.3 codetools_0.2-15
## [13] statmod_1.4.29 scran_1.5.1
## [15] DT_0.2 miniUI_0.1.1
## [17] colorspace_1.3-2 energy_1.7-0
## [19] knitr_1.15.1 uuid_0.1-2
## [21] pspline_1.0-17 robustbase_0.92-7
## [23] bayesm_3.0-2 NMF_0.20.6
## [25] tximport_1.5.0 GenomeInfoDbData_0.99.0
## [27] hwriter_1.3.2 rhdf5_2.21.0
## [29] rprojroot_1.2 EDASeq_2.11.0
## [31] diptest_0.75-7 R6_2.2.1
## [33] doParallel_1.0.10 ggbeeswarm_0.5.3
## [35] taxize_0.8.4 locfit_1.5-9.1
## [37] flexmix_2.3-14 bitops_1.0-6
## [39] reshape_0.8.6 assertthat_0.2.0
## [41] scales_0.4.1 nnet_7.3-12
## [43] beeswarm_0.2.3 gtable_0.2.0
## [45] phylobase_0.8.4 RUVSeq_1.11.0
## [47] bold_0.4.0 genefilter_1.59.0
## [49] splines_3.4.0 rtracklayer_1.36.0
## [51] lazyeval_0.2.0 hexbin_1.27.1
## [53] rgl_0.98.1 yaml_2.1.14
## [55] reshape2_1.4.2 abind_1.4-5
## [57] GenomicFeatures_1.29.1 backports_1.0.5
## [59] httpuv_1.3.3 tensorA_0.36
## [61] tools_3.4.0 gridBase_0.4-7
## [63] ggplot2_2.2.1 gplots_3.0.1
## [65] RColorBrewer_1.1-2 dynamicTreeCut_1.63-1
## [67] stabledist_0.7-1 Rcpp_0.12.10
## [69] plyr_1.8.4 visNetwork_1.0.3
## [71] progress_1.1.2 zlibbioc_1.23.0
## [73] purrr_0.2.2 RCurl_1.95-4.8
## [75] prettyunits_1.0.2 viridis_0.4.0
## [77] zoo_1.8-0 cluster_2.0.6
## [79] magrittr_1.5 data.table_1.10.4
## [81] RSpectra_0.12-0 mvtnorm_1.0-6
## [83] whisker_0.3-2 gsl_1.9-10.3
## [85] aroma.light_3.7.0 mime_0.5
## [87] evaluate_0.10 xtable_1.8-2
## [89] XML_3.98-1.7 mclust_5.2.3
## [91] gridExtra_2.2.1 compiler_3.4.0
## [93] biomaRt_2.33.1 scater_1.5.0
## [95] tibble_1.3.0 KernSmooth_2.23-15
## [97] R.oo_1.21.0 htmltools_0.3.6
## [99] segmented_0.5-1.4 pcaPP_1.9-61
## [101] tidyr_0.6.2 geneplotter_1.55.0
## [103] howmany_0.3-1 DBI_0.6-1
## [105] MASS_7.3-47 fpc_2.1-10
## [107] MAST_1.3.0 boot_1.3-19
## [109] compositions_1.40-1 ShortRead_1.35.1
## [111] Matrix_1.2-10 ade4_1.7-6
## [113] R.methodsS3_1.7.1 gdata_2.17.0
## [115] igraph_1.0.1 rncl_0.8.2
## [117] GenomicAlignments_1.13.1 registry_0.3
## [119] numDeriv_2016.8-1 locfdr_1.1-8
## [121] plotly_4.6.0 xml2_1.1.1
## [123] foreach_1.4.3 rARPACK_0.11-0
## [125] annotate_1.55.0 vipor_0.4.5
## [127] rngtools_1.2.4 pkgmaker_0.22
## [129] XVector_0.17.0 stringr_1.2.0
## [131] digest_0.6.12 copula_0.999-16
## [133] ADGofTest_0.3 softImpute_1.4
## [135] Biostrings_2.44.0 rmarkdown_1.5
## [137] dendextend_1.5.2 edgeR_3.19.0
## [139] kernlab_0.9-25 shiny_1.0.3
## [141] Rsamtools_1.29.0 gtools_3.5.0
## [143] modeltools_0.2-21 rjson_0.2.15
## [145] nlme_3.1-131 jsonlite_1.4
## [147] viridisLite_0.2.0 limma_3.33.2
## [149] lattice_0.20-35 httr_1.2.1
## [151] DEoptimR_1.0-8 survival_2.41-3
## [153] FNN_1.1 prabclus_2.2-6
## [155] iterators_1.0.8 glmnet_2.0-10
## [157] class_7.3-14 stringi_1.1.5
## [159] mixtools_1.1.0 latticeExtra_0.6-28
## [161] caTools_1.17.1 memoise_1.1.0
## [163] dplyr_0.5.0 ape_4.1